{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f01396a2",
   "metadata": {},
   "source": [
    "## 1. 马尔可夫奖励过程"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1c5c2f6b",
   "metadata": {},
   "source": [
    "### 1.1 MRP代码实现"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ae55f61d",
   "metadata": {},
   "source": [
    "<img src=\"../images/MRP1.png\" width=\"50%\"></img>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "15b5d3e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_next_state_value(discount_factor, rewards, old_state_values, transition_probabilities):\n",
    "    # 创建一个空列表new_state_values，用于存储计算得到的下一阶段的状态值函数。\n",
    "    new_state_values = []\n",
    "    # 使用for循环遍历old_state_values列表的索引，current_state变量表示当前状态的索引。\n",
    "    for current_state in range(len(old_state_values)):\n",
    "        # 检查当前状态是否为最后一个状态，如果不是最后一个状态，执行以下代码块。\n",
    "        if current_state != len(old_state_values) - 1:\n",
    "            # 创建一个变量next_state_value_sum，用于累加与当前状态相连的后续状态值函数的加权和。\n",
    "            next_state_value_sum = 0.0\n",
    "            # 使用for循环遍历transition_probabilities[current_state]列表中的每个元素，transition变量表示状态转移概率的一个元素。\n",
    "            for transition in transition_probabilities[current_state]:\n",
    "                # 从transition元素中获取转移概率和下一状态的索引，并将其加权乘以下一状态的旧值函数，然后累加到next_state_value_sum上。\n",
    "                transition_probability = transition[0]\n",
    "                next_state_index = transition[1]\n",
    "                next_state_value_sum += transition_probability * old_state_values[next_state_index]\n",
    "\n",
    "            # 根据当前状态的即时奖励和折扣因子，计算当前状态在下一阶段的预期收益，并将其添加到new_state_values列表中。\n",
    "            new_state_value = rewards[current_state] + discount_factor * next_state_value_sum\n",
    "            new_state_values.append(new_state_value)\n",
    "\n",
    "    # 在new_state_values列表末尾添加一个值为0的元素，表示Sleep状态的值函数为0。\n",
    "    new_state_values.append(0.0)\n",
    "    return new_state_values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5463befd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 定义γ\n",
    "discount_factor = 1\n",
    "\n",
    "# 报酬顺序：Class 1、Class 2、Class 3、Pass、Pub、Facebook、Sleep\n",
    "rewards = [-2., -2., -2., 10., 1., -1., 0.]\n",
    "\n",
    "# 全0初始化值函数\n",
    "old_state_values = [0, 0, 0, 0, 0, 0, 0]\n",
    "\n",
    "'''\n",
    "定义了一个嵌套列表transition_probabilities，表示状态之间的转移概率\n",
    "每个子列表表示当前状态的后续状态以及对应的转移概率。\n",
    "transition_probabilities[0]表示Class 1的状态后续状态的转移概率，\n",
    "其中[0.5, 1]表示以0.5的概率转移到Class 2，[0.5, 5]表示以0.5的概率转移到Facebook。\n",
    "'''\n",
    "transition_probabilities = [\n",
    "    [[0.5, 1], [0.5, 5]],\n",
    "    [[0.8, 2], [0.2, 6]],\n",
    "    [[0.6, 3], [0.4, 4]],\n",
    "    [[1, 6]],\n",
    "    [[0.2, 0], [0.4, 1], [0.4, 2]],\n",
    "    [[0.1, 0], [0.9, 5]],\n",
    "    [[0, 0]]\n",
    "]\n",
    "# new_state_values用于存储每次迭代得到的新的状态值函数。\n",
    "new_state_values = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "31eb57ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(100):\n",
    "    new_state_values = calculate_next_state_value(discount_factor, rewards, old_state_values, transition_probabilities)\n",
    "    # 使用新生成的值函数列表替换旧的值函数列表\n",
    "    old_state_values = new_state_values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b15efef6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[-12.418287702397645, 1.4703089405374907, 4.3371337110593915, 10.0, 0.8410368812854547, -22.318009521720207, 0.0]\n"
     ]
    }
   ],
   "source": [
    "print(new_state_values)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9768d1ef",
   "metadata": {},
   "source": [
    "<img src=\"../images/mrp2.png\" width=\"50%\"></img>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3f021e23",
   "metadata": {},
   "source": [
    "### 1.2 MRP矩阵形式求解"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "9cd1477f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[-12.54320988]\n",
      " [  1.45679012]\n",
      " [  4.32098765]\n",
      " [ 10.        ]\n",
      " [  0.80246914]\n",
      " [-22.54320988]\n",
      " [  0.        ]]\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "# 折扣因子γ\n",
    "discount_factor = 1\n",
    "\n",
    "# 状态转移概率矩阵\n",
    "transition_matrix = np.array([[0, 0.5, 0, 0, 0, 0.5, 0],\n",
    "                             [0, 0, 0.8, 0, 0, 0, 0.2],\n",
    "                             [0, 0, 0, 0.6, 0.4, 0, 0],\n",
    "                             [0, 0, 0, 0, 0, 0, 1],\n",
    "                             [0.2, 0.4, 0.4, 0, 0, 0, 0],\n",
    "                             [0.1, 0, 0, 0, 0, 0.9, 0],\n",
    "                             [0, 0, 0, 0, 0, 0, 0]])\n",
    "\n",
    "# 即时奖励矩阵\n",
    "rewards = np.array([[-2], [-2], [-2], [10], [1], [-1], [0]])\n",
    "\n",
    "# 构造单位矩阵\n",
    "identity_matrix = np.eye(7)\n",
    "\n",
    "# 直接使用np.linalg.inv()函数求解逆矩阵，并使用@操作符进行矩阵乘法运算\n",
    "value_matrix = np.linalg.inv(identity_matrix - discount_factor * transition_matrix) @ rewards\n",
    "\n",
    "# 打印输出计算得到的状态值函数矩阵\n",
    "print(value_matrix)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d6eeb7c8",
   "metadata": {},
   "source": [
    "## 2. 马尔可夫决策过程"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "55ef321e",
   "metadata": {},
   "source": [
    "### 2.1 MDP代码实现"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6740efb6",
   "metadata": {},
   "source": [
    "<img src=\"../images/mdp1.png\" width=\"50%\"></img>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "1e5997b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_next_state_value(probability, rewards, old_values, transition_probabilities):\n",
    "    # 创建一个空列表new_values，用于存储计算得到的下一阶段的状态值函数。\n",
    "    new_values = []\n",
    "    for state_index in range(len(old_values)):\n",
    "        if state_index != len(old_values) - 1:\n",
    "            state_sum = 0.0\n",
    "            for transition_index in range(len(transition_probabilities[state_index])):\n",
    "                if isinstance(transition_probabilities[state_index][transition_index][0], list):\n",
    "                    # 如果转移是一个列表，则需要考虑多个子转移的情况\n",
    "                    transition_sum = 0.0\n",
    "                    for sub_transition in transition_probabilities[state_index][transition_index][0]:\n",
    "                        transition_sum += old_values[sub_transition[0]] * sub_transition[1]\n",
    "                    state_sum += probability * (rewards[transition_probabilities[state_index][transition_index][1]]\n",
    "                                                + transition_sum)\n",
    "                else:\n",
    "                    # 单个转移的情况\n",
    "                    state_sum += probability * (rewards[transition_probabilities[state_index][transition_index][1]]\n",
    "                                                + old_values[transition_probabilities[state_index][transition_index][0]])\n",
    "            new_values.append(state_sum)\n",
    "\n",
    "    new_values.append(0.0)\n",
    "    return new_values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f6de7e50",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 策略参数，控制状态转移概率的权重\n",
    "probability_pi = 0.5\n",
    "# 定义即时奖励列表：study pass pub facebook quit sleep\n",
    "rewards_list = [-2., 10., 1., -1., 0., 0.]\n",
    "# 定义旧状态值列表\n",
    "old_values_list = [0, 0, 0, 0, 0]\n",
    "# 定义转移概率列表\n",
    "transition_probabilities = [[[1, 0], [3, 3]],\n",
    "                            [[2, 0], [4, 5]],\n",
    "                            [[[[0, 0.2], [1, 0.4], [2, 0.4]], 2], [4, 1]],\n",
    "                            [[0, 4], [3, 3]],\n",
    "                            []]\n",
    "# new_values_list用于存储迭代计算得到的新状态值。\n",
    "new_values_list = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "9f4da650",
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(100):\n",
    "    new_values_list = calculate_next_state_value(probability_pi, rewards_list, old_values_list, transition_probabilities)\n",
    "    old_values_list = new_values_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "d276c43b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[-1.3076923099959044, 2.6923076920317515, 7.384615384159404, -2.3076923112229597, 0.0]\n"
     ]
    }
   ],
   "source": [
    "print(new_values_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3747fee1",
   "metadata": {},
   "source": [
    "<img src=\"../images/mdp2.png\" width=\"50%\"></img>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ea977270",
   "metadata": {},
   "source": [
    "### 2.2 MDP矩阵形式求解"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "9b721e36",
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_state_value(pi, lambd, transition_probabilities, rewards, identity_matrix):\n",
    "    p_mat = np.matrix(transition_probabilities)\n",
    "    r_mat = np.matrix(rewards)\n",
    "    i_mat = np.matrix(identity_matrix)\n",
    "    # 根据Bellman方程计算状态值\n",
    "    v_mat = (i_mat - lambd * p_mat).I * r_mat\n",
    "    return v_mat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "37121039",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[-1.30769231]\n",
      " [ 2.69230769]\n",
      " [ 7.38461538]\n",
      " [-2.30769231]\n",
      " [ 0.        ]]\n"
     ]
    }
   ],
   "source": [
    "pi = 0.5\n",
    "lambd = 1\n",
    "\n",
    "transition_probabilities = [[0, pi, 0, pi, 0],\n",
    "                            [0, 0, pi, 0, pi],\n",
    "                            [pi*0.2, pi*0.4, pi*0.4, 0, pi],\n",
    "                            [pi, 0, 0, pi, 0],\n",
    "                            [0, 0, 0, 0, 0]]\n",
    "rewards = [[pi*-2 + pi*-1],\n",
    "           [pi*-2 + pi*0],\n",
    "           [pi*1 + pi*10],\n",
    "           [pi*0 + pi*-1],\n",
    "           [0]]\n",
    "\n",
    "identity_matrix = np.eye(5)\n",
    "\n",
    "state_values = calculate_state_value(pi, lambd, transition_probabilities, rewards, identity_matrix)\n",
    "\n",
    "print(state_values)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4d15e690",
   "metadata": {},
   "source": [
    "## 3. 贝尔曼最优方程"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "00530fa7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_next_state_value(policy_param, rewards_list, old_values_list, weight_list):\n",
    "    new_values_list = []\n",
    "    for state_index in range(len(old_values_list)):\n",
    "        if state_index != len(old_values_list) - 1:\n",
    "            max_list = []\n",
    "            for transition_index in range(len(weight_list[state_index])):\n",
    "                if isinstance(weight_list[state_index][transition_index][0], list):\n",
    "                    transition_sum = 0.0\n",
    "                    for sub_transition in weight_list[state_index][transition_index][0]:\n",
    "                        transition_sum += old_values_list[sub_transition[0]] * sub_transition[1]\n",
    "                    max_list.append(policy_param * (rewards_list[weight_list[state_index][transition_index][1]] + transition_sum))\n",
    "                else:\n",
    "                    max_list.append(policy_param * (rewards_list[weight_list[state_index][transition_index][1]]\n",
    "                                                    + old_values_list[weight_list[state_index][transition_index][0]]))\n",
    "            new_values_list.append(max(max_list))\n",
    "\n",
    "    new_values_list.append(0.0)\n",
    "    return new_values_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "5d9bb125",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[6.0, 8.0, 10.0, 6.0, 0.0]\n"
     ]
    }
   ],
   "source": [
    "policy_param = 1  # 策略参数，控制状态转移概率的权重\n",
    "# 即时奖励列表，表示每个状态的即时奖励：study pass pub facebook quit sleep\n",
    "rewards_list = [-2., 10., 1., -1., 0., 0.]  \n",
    "old_values_list = [0, 0, 0, 0, 0]  # 上一轮迭代的状态值列表\n",
    "weight_list = [[[1, 0], [3, 3]],\n",
    "               [[2, 0], [4, 5]],\n",
    "               [[[[0, 0.2], [1, 0.4], [2, 0.4]], 2], [4, 1]],\n",
    "               [[0, 4], [3, 3]],\n",
    "               []]  # 权重列表，描述状态转移和即时奖励的对应关系\n",
    "\n",
    "new_values_list = []\n",
    "for i in range(100):\n",
    "    new_values_list = calculate_next_state_value(policy_param, rewards_list, old_values_list, weight_list)\n",
    "    old_values_list = new_values_list\n",
    "\n",
    "print(new_values_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "381694bb",
   "metadata": {},
   "source": [
    "<img src=\"../images/ovf.png\" width=\"50%\"></img>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c480077f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
